# -*- coding: UTF-8 -*-
import urllib
import urllib2
from bs4 import BeautifulSoup
import codecs

from xlwt import *

# Workbook collecting one row per movie scraped from v.qq.com.
wbk = Workbook()
sheet = wbk.add_sheet('sheet 1')

# Next free row; row 0 holds the column headers written below.
index = 1

# Column headers: title, rating count, score, year, region, genre,
# play count, director, cast.
_HEADERS = (u'标题', u'评分人数', u'评分', u'年份', u'地区', u'类型',
            u'播放量', u'导演', u'主演')
for _col, _label in enumerate(_HEADERS):
    sheet.write(0, _col, _label)

def getURLs(url):
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page, from_encoding="gb18030")

    mod_item_tit = soup.findAll('div', 'mod_item_tit')
    for i in range(len(mod_item_tit)):
        URL = mod_item_tit[i].find('a').get('href')

        print URL
        getInfo(URL)

def getInfo(url):
    global index

    page = urllib2.urlopen(url)
    
    soup = BeautifulSoup(page, from_encoding="gb18030")
    
    print soup
    print '-'
    print soup.find('div', 'video_title')

    print '---'
    print soup.findAll('div', {'info_cast', 'actors'})
    print '**'
    #print soup.find('div', 'info_cast').children()
    #find_all(self, name=None, attrs={}, recursive=True, text=None,
    #             limit=None, **kwargs):
    print '---'
    #标题
    titlist = soup.findAll('div', 'mod_item_tit')
    print len(titlist)

    mod_item_tit = soup.findAll('div', 'mod_item_tit')
    mod_scores = soup.findAll('div', 'mod_scores')
    mod_info = soup.findAll('ul', 'mod_info')
    print mod_info

    print len(mod_item_tit), len(mod_scores),len(mod_info)

    for i in range(len(mod_item_tit)):
        title = mod_item_tit[i].find('a').getText()
        print u'标题：' + title

        score = mod_scores[i].find('strong').getText()
        print u'评分：' + score
        sheet.write(index, 2, score)
        print mod_info[i]

        year = ''.join(mod_info[i].find('li', 'date').getText().split()).replace(u'年份：', '')
        area = mod_info[i].find('li', 'area').getText().replace(u'地区：', '')
        print '$$$$$$$$$: ', mod_info[i].find(u'类型')
        print mod_info[i].find('li', 'category')
        print mod_info[i]
        types = ' '.join(mod_info[i].find('li', 'category').getText().split()).replace(u'类型： ', '')
        count = ''.join(mod_info[i].find('li', 'play_time').getText().split()).replace(u'播放：', '')
        directors = ' '.join(mod_info[i].find('li', 'director').getText().split()).replace(u'导演： ', '')
        actors = ' '.join(mod_info[i].find('li', 'performer').getText().split()).replace(u'主演： ', '')
        
        print u'年份：' + year
        sheet.write(index, 3, year)
        print u'地区：' + area
        sheet.write(index, 4, area)
        print u'类型：' + types
        sheet.write(index, 5, types)
        print u'播放：' + count
        sheet.write(index, 6, count)
        print u'导演：' + directors
        sheet.write(index, 7, directors)
        print u'主演：' + actors
        sheet.write(index, 8, actors)

        print '++++++++++++++++++++++++++++++++++++++++++++++'
        index +=1
    

'''
http://v.qq.com/list/1_-1_-1_-1_0_0_0_20_2_-1_0.html
http://v.qq.com/list/1_-1_-1_-1_0_0_1_20_2_-1_0.html
http://v.qq.com/list/1_-1_-1_-1_0_0_2_20_2_-1_0.html

http://v.qq.com/list/1_-1_-1_-1_0_1_0_10_2_-1_0.html
http://v.qq.com/list/1_-1_-1_-1_0_1_1_10_2_-1_0.html

http://v.qq.com/movielist/10002/0/0/0/0/10/1/0.html
http://v.qq.com/movielist/10002/0/0/0/0/10/1/0.html
http://v.qq.com/movielist/10002/0/0/0/1/10/1/0.html
http://v.qq.com/movielist/10002/0/0/0/3/10/1/0.html

http://v.qq.com/movielist/10002/0/0/0/0/20/0/0.html
http://v.qq.com/movielist/10002/0/0/0/0/10/1/0.html
http://v.qq.com/movielist/10002/0/0/0/1/20/0/0.html

'''

# Walk the first 45 listing pages; the path segment before '/10/1/0' is
# the zero-based page number.
URL_TEMPLATE = 'http://v.qq.com/movielist/10002/0/0/0/%d/10/1/0.html'
for page_no in range(45):
    getURLs(URL_TEMPLATE % page_no)

wbk.save('ssssss.xls')

    
